16S-rDNA-V3-V4 repository: https://github.com/ycl6/16S-rDNA-V3-V4
export2graphlan: GitHub, export2graphlan
GraPhlAn: GitHub, Documentation, Paper
Demo Dataset: PRJEB27564 from Gut Microbiota in Parkinson’s Disease: Temporal Stability and Relations to Disease Progression. EBioMedicine. 2019;44:691-707
License: GPL-3.0
Note: Requires Python 2
Rcd /ngs/16S-Demo/PRJEB27564
R
library("data.table")
library("phyloseq")
library("ggplot2")
library("dplyr")
library("grid")
Sys.setlocale("LC_COLLATE", "C")## [1] "C"
Set the full-path to additional scripts
Create a folder lefse
# Patient vs. control at baseline
ps1a = prune_samples(sample_names(ps1)[grep("B$", sample_names(ps1))], ps1)
ps1a = prune_taxa(taxa_sums(ps1a) > 0, ps1a)
# Patient followup vs. patient baseline
ps1b = prune_samples(sample_names(ps1)[grep("^P", sample_names(ps1))], ps1)
ps1b = prune_taxa(taxa_sums(ps1b) > 0, ps1b)# Patient vs. control at baseline
ps1a = prune_samples(sample_names(ps1)[grep("B$", sample_names(ps1))], ps1)
ps1a = prune_taxa(taxa_sums(ps1a) > 0, ps1a)
# Patient followup vs. patient baseline
ps1b = prune_samples(sample_names(ps1)[grep("^P", sample_names(ps1))], ps1)
ps1b = prune_taxa(taxa_sums(ps1b) > 0, ps1b)
tax1 = lefse_1name_obj(ps1a, sample_data(ps1a)$Group)
lefse1 = lefse_obj(ps1a)
lefse1 = rbind(tax1, lefse1)
tax2 = lefse_1name_obj(ps1b, sample_data(ps1b)$Time)
lefse2 = lefse_obj(ps1b)
lefse2 = rbind(tax2, lefse2)
# Replace unsupported chars
lefse1$name = gsub("-","_",lefse1$name)
lefse1$name = gsub("/","_",lefse1$name)
lefse2$name = gsub("-","_",lefse2$name)
lefse2$name = gsub("/","_",lefse2$name) For Silva v132 users, apply below to fix identifical phylum & class names (Actinobacteria and Deferribacteres)
We use linear discriminant analysis (LDA) effect size (LEfSe) to determines the features (in this case the clades at each taxonomic rank) most likely to explain the differences between Parkinson’s patients and control subjects
system2(paste0(nsegata_lefse, "format_input.py"),
args = c("lefse/expr1.lefse_table.txt", "lefse/expr1.lefse_table.in",
"-c 1", "-u 2", "-o 1000000"))
system2(paste0(nsegata_lefse, "format_input.py"),
args = c("lefse/expr2.lefse_table.txt", "lefse/expr2.lefse_table.in",
"-c 1", "-u 2", "-o 1000000"))
# set KW alpha to 1 to allow returning of all P-value to perform adjustment later
system2(paste0(nsegata_lefse, "run_lefse.py"),
args = c("lefse/expr1.lefse_table.in", "lefse/expr1.lefse_table.res",
"-b 100", "-a 1", "-l 2"), stdout = TRUE)## [1] "Number of significantly discriminative features: 33 ( 252 ) before internal wilcoxon"
## [2] "Number of discriminative features with abs LDA score > 2.0 : 32"
system2(paste0(nsegata_lefse, "run_lefse.py"),
args = c("lefse/expr2.lefse_table.in", "lefse/expr2.lefse_table.res",
"-b 100", "-a 1", "-l 2"), stdout = TRUE)## [1] "Number of significantly discriminative features: 9 ( 251 ) before internal wilcoxon"
## [2] "Number of discriminative features with abs LDA score > 2.0 : 9"
Perform multiple testing correction on KW P-values
# set fdr threshold at 0.1
q = 0.1
expr1 = data.frame(fread("lefse/expr1.lefse_table.res"))
expr1 = pcorrection(expr1, q)
expr2 = data.frame(fread("lefse/expr2.lefse_table.res"))
expr2 = pcorrection(expr2, q)
# You can use table() to check number of taxa pass the threshold
table(expr1$V3 != "")##
## FALSE TRUE
## 222 32
##
## FALSE TRUE
## 245 9
# Write new result file with the P-value column replaced by the FDR
write.table(expr1[,c(1:4,6)], file = "lefse/expr1.lefse_table.res.padj", sep = "\t", quote = F,
row.names = F, col.names = F)
write.table(expr2[,c(1:4,6)], file = "lefse/expr2.lefse_table.res.padj", sep = "\t", quote = F,
row.names = F, col.names = F)Set the full-paths to export2graphlan, GraPhlAn and lefse.pl if they are not in PATH
export2graphlan and GraPhlAnNote: Update the coloring
lefse/expr1.colorsandlefse/expr2.colorsif necessary. The colors should be defined in HSV (hue, saturation, value) scale
Patient vs. control at baseline (expr1)
system2(export2graphlan,
args = c("-i lefse/expr1.lefse_table.txt", "-o lefse/expr1.lefse_table.res.padj",
"-t lefse/expr1.graphlan_tree.txt", "-a lefse/expr1.graphlan_annot.txt",
"--external_annotations 2,3,4,5,6", "--fname_row 0", "--skip_rows 1",
"--biomarkers2colors lefse/expr1.colors"))
system2(graphlan_annotat,
args = c("--annot lefse/expr1.graphlan_annot.txt",
"lefse/expr1.graphlan_tree.txt",
"lefse/expr1.graphlan_outtree.txt"))
system2(graphlan,
args = c("--dpi 150", "lefse/expr1.graphlan_outtree.txt", "lefse/expr1.graphlan.png",
"--external_legends", "--size 8", "--pad 0.2"))“lefse/expr1.graphlan.png”
Patient at baseline vs. patient at followup (expr2)
system2(export2graphlan,
args = c("-i lefse/expr2.lefse_table.txt", "-o lefse/expr2.lefse_table.res.padj",
"-t lefse/expr2.graphlan_tree.txt", "-a lefse/expr2.graphlan_annot.txt",
"--external_annotations 2,3,4,5,6", "--fname_row 0", "--skip_rows 1",
"--biomarkers2colors lefse/expr2.colors"))
system2(graphlan_annotat,
args = c("--annot lefse/expr2.graphlan_annot.txt",
"lefse/expr2.graphlan_tree.txt",
"lefse/expr2.graphlan_outtree.txt"))
system2(graphlan,
args = c("--dpi 150", "lefse/expr2.graphlan_outtree.txt", "lefse/expr2.graphlan.png",
"--external_legends", "--size 8", "--pad 0.2"))“lefse/expr2.graphlan.png”
Patient vs. control at baseline (expr1)
res1 = data.frame(fread("lefse/expr1.out"))
res1 = res1[order(res1$order),]
names(res1)[c(5:7)] = c("Group","LDA","FDR")
res1$taxon = paste0(res1$taxon, "(",res1$rank, ";", " ",res1$order, ")")
res1$taxon = as.factor(res1$taxon)
res1$taxon = factor(res1$taxon, levels = unique(res1$taxon[order(res1$order, decreasing = T)]))
res1$Group = as.factor(res1$Group)
levels(res1$taxon) = gsub("Escherichia_Shigella","Escherichia/Shigella",levels(res1$taxon))
levels(res1$taxon) = gsub("_UCG_","_UCG-",levels(res1$taxon))Patient at baseline vs. patient at followup (expr2)
res2 = data.frame(fread("lefse/expr2.out"))
res2 = res2[order(res2$order),]
names(res2)[c(5:7)] = c("Group","LDA","FDR")
res2$taxon = paste0(res2$taxon, "(",res2$rank, ";", " ",res2$order, ")")
res2$taxon = as.factor(res2$taxon)
res2$taxon = factor(res2$taxon, levels = unique(res2$taxon[order(res2$order, decreasing = T)]))
res2$Group = as.factor(res2$Group)
levels(res2$taxon) = gsub("Escherichia_Shigella","Escherichia/Shigella",levels(res2$taxon))
levels(res2$taxon) = gsub("_UCG_","_UCG-",levels(res2$taxon))Patient vs. control at baseline (expr1)
plot_lefse = ggplot(res1, aes(taxon, LDA, fill = Group)) +
geom_bar(stat = "identity", width = 0.7, size = 0.5) + coord_flip() +
theme_bw() + facet_wrap(~ Group, ncol = 1, scales = "free_y") +
scale_fill_manual(values = c("control" = "blue", "patient" = "red")) +
theme(legend.position = "none",
axis.text.x = element_text(size = 12),
axis.text.y = element_text(face = "bold", size = 8),
strip.text.x = element_text(face = "bold", size = 12)) +
labs(title = "LEfSe Analysis (Baseline)", x = "Taxon", y = "LDA")
groupN = res1 %>% group_by(Group) %>% summarise(count = length(unique(taxon)))## `summarise()` ungrouping output (override with `.groups` argument)
gt = ggplotGrob(plot_lefse)
panelI.1 = gt$layout$t[grepl("panel", gt$layout$name)]
gt$heights[panelI.1] = unit(groupN$count, "null")
invisible(dev.off())
png("lefse/expr1.lefse_table.png", width = 6, height = 6, units = "in", res = 300)
grid.draw(gt)
invisible(dev.off())“lefse/expr1.lefse_table.png”
Patient at baseline vs. patient at followup (expr2)
plot_lefse = ggplot(res2, aes(taxon, LDA, fill = Group)) +
geom_bar(stat = "identity", width = 0.7, size = 0.5) + coord_flip() +
theme_bw() + facet_wrap(~ Group, ncol = 1, scales = "free_y") +
scale_fill_manual(values = c("baseline" = "blue", "followup" = "red")) +
theme(legend.position = "none",
axis.text.x = element_text(size = 12),
axis.text.y = element_text(face = "bold", size = 8),
strip.text.x = element_text(face = "bold", size = 12)) +
labs(title = "LEfSe Analysis (followup)", x = "Taxon", y = "LDA")
groupN = res2 %>% group_by(Group) %>% summarise(count = length(unique(taxon)))## `summarise()` ungrouping output (override with `.groups` argument)
gt = ggplotGrob(plot_lefse)
panelI.1 = gt$layout$t[grepl("panel", gt$layout$name)]
gt$heights[panelI.1] = unit(groupN$count, "null")
invisible(dev.off())
png("lefse/expr2.lefse_table.png", width = 6, height = 3, units = "in", res = 300)
grid.draw(gt)
invisible(dev.off())“lefse/expr2.lefse_table.png”
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/p27/lib/libopenblasp-r0.3.7.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C LC_TIME=en_GB.UTF-8
## [4] LC_COLLATE=C LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dplyr_1.0.0 ggplot2_3.3.2 phyloseq_1.30.0 data.table_1.12.8
## [5] knitr_1.29
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-148 bitops_1.0-6 matrixStats_0.56.0
## [4] bit64_0.9-7 RColorBrewer_1.1-2 GenomeInfoDb_1.22.1
## [7] tools_3.6.2 backports_1.1.8 R6_2.4.1
## [10] vegan_2.5-6 rpart_4.1-15 DBI_1.1.0
## [13] Hmisc_4.4-0 BiocGenerics_0.32.0 mgcv_1.8-31
## [16] colorspace_1.4-1 permute_0.9-5 ade4_1.7-15
## [19] nnet_7.3-14 withr_2.2.0 tidyselect_1.1.0
## [22] gridExtra_2.3 DESeq2_1.26.0 bit_1.1-15.2
## [25] compiler_3.6.2 Biobase_2.46.0 htmlTable_2.0.1
## [28] DelayedArray_0.12.3 scales_1.1.1 checkmate_2.0.0
## [31] genefilter_1.68.0 stringr_1.4.0 digest_0.6.25
## [34] Rsamtools_2.2.3 foreign_0.8-73 rmarkdown_2.3
## [37] dada2_1.14.1 XVector_0.26.0 base64enc_0.1-3
## [40] jpeg_0.1-8.1 pkgconfig_2.0.3 htmltools_0.5.0
## [43] highr_0.8 htmlwidgets_1.5.1 rlang_0.4.6
## [46] RSQLite_2.2.0 rstudioapi_0.11 generics_0.0.2
## [49] hwriter_1.3.2 jsonlite_1.7.0 BiocParallel_1.20.1
## [52] acepack_1.4.1 RCurl_1.98-1.2 magrittr_1.5
## [55] GenomeInfoDbData_1.2.2 Formula_1.2-3 biomformat_1.14.0
## [58] Matrix_1.2-18 Rcpp_1.0.5 munsell_0.5.0
## [61] S4Vectors_0.24.4 Rhdf5lib_1.8.0 ape_5.4
## [64] lifecycle_0.2.0 stringi_1.4.6 yaml_2.2.1
## [67] MASS_7.3-51.6 SummarizedExperiment_1.16.1 zlibbioc_1.32.0
## [70] rhdf5_2.30.1 plyr_1.8.6 blob_1.2.1
## [73] parallel_3.6.2 crayon_1.3.4 lattice_0.20-41
## [76] Biostrings_2.54.0 splines_3.6.2 annotate_1.64.0
## [79] multtest_2.42.0 locfit_1.5-9.4 pillar_1.4.5
## [82] igraph_1.2.5 GenomicRanges_1.38.0 geneplotter_1.64.0
## [85] reshape2_1.4.4 codetools_0.2-16 stats4_3.6.2
## [88] XML_3.98-1.20 glue_1.4.1 evaluate_0.14
## [91] ShortRead_1.44.3 latticeExtra_0.6-29 RcppParallel_5.0.2
## [94] png_0.1-7 vctrs_0.3.1 foreach_1.5.0
## [97] gtable_0.3.0 purrr_0.3.4 xfun_0.15
## [100] xtable_1.8-4 survival_3.2-3 tibble_3.0.2
## [103] iterators_1.0.12 memoise_1.1.0 AnnotationDbi_1.48.0
## [106] GenomicAlignments_1.22.1 IRanges_2.20.2 cluster_2.1.0
## [109] ellipsis_0.3.1